%load_ext autoreload
%autoreload 2
import logging
logging.basicConfig(level=logging.DEBUG)
import polars as pl
pl.enable_string_cache() # TODO: remove eventually with proper enums
from zipfile import ZipFile
from pathlib import Path
from polars import col as C
import plotly.express as px
from ctrace.constants import *
import ctrace as ct
import tempfile
_logger = logging.getLogger(__name__)
# The zip files released by climate trace
_files = [
"agriculture.zip",
"buildings.zip",
"fluorinated_gases.zip",
"forestry_and_land_use.zip",
"fossil_fuel_operations.zip",
"manufacturing.zip",
"mineral_extraction.zip",
"power.zip",
"transportation.zip",
"waste.zip"
]
def _recast_parquet(df) -> pl.DataFrame:
cf_cols = [SOURCE_TYPE, CAPACITY, CAPACITY_FACTOR, ACTIVITY, CO2_EMISSIONS_FACTOR,
CH4_EMISSIONS_FACTOR, N2O_EMISSIONS_FACTOR, CO2_EMISSIONS, CH4_EMISSIONS,
N2O_EMISSIONS, TOTAL_CO2E_20YRGWP, TOTAL_CO2E_100YRGWP]
return (df.with_columns(
c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY),
c_gas.cast(ct.gas_enum, strict=False).alias(GAS),
# TODO: change eventually to enum
c_original_inventory_sector.cast(pl.String).alias(ORIGINAL_INVENTORY_SECTOR),
c_temporal_granularity.cast(ct.temporal_granularity_enum).alias(TEMPORAL_GRANULARITY),
).with_columns(
*[C("conf_" + col_name).cast(pl.Categorical, strict=False).alias("conf_" + col_name) for col_name in cf_cols]
))
def _load_sources(fp) -> pl.DataFrame:
dates = [
START_TIME, END_TIME, CREATED_DATE, MODIFIED_DATE
]
uint64s = [SOURCE_ID]
floats = [EMISSIONS_QUANTITY, EMISSIONS_FACTOR, CAPACITY, CAPACITY_FACTOR, ACTIVITY, LAT, LON]
enums = {
ISO3_COUNTRY:ct.iso3_enum,
GAS:ct.gas_enum,
TEMPORAL_GRANULARITY:ct.temporal_granularity_enum,
# TODO: activate once the whole list is gathered
# ORIGINAL_INVENTORY_SECTOR:ct.inventory_sector_enum
}
# TODO: make it lazy
df = pl.read_csv(fp.read(), infer_schema_length=0) #.lazy()
num_other = 12
check_cols = [ACTIVITY, ACTIVITY_UNITS, EMISSIONS_FACTOR, EMISSIONS_FACTOR_UNITS,
CAPACITY_UNITS, CAPACITY, CAPACITY_FACTOR, GEOMETRY_REF, LAT, LON, ORIGINAL_INVENTORY_SECTOR] + [f"other{i}" for i in range(1,num_other+1)] + [f"other{i}_def" for i in range(1,num_other+1)]
for col_name in check_cols:
if col_name not in df.columns:
df = df.with_columns(pl.lit(None).cast(pl.String, strict=False).alias(col_name))
# Check that the columns match exactly
s1 = set(df.columns)
s2 = set(all_columns)
assert s1 == s2, (s1-s2, s2-s1, list(zip(sorted(df.columns), sorted(all_columns))))
df = df.select(*all_columns)
return (df.with_columns(
# Only start_time and end_time are required
*[C(col_name).str.to_datetime(strict=(col_name in {START_TIME, END_TIME})).alias(col_name) for col_name in dates])
.with_columns(
c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY),
c_gas.cast(ct.gas_enum).alias(GAS),
c_temporal_granularity.cast(ct.temporal_granularity_enum).alias("temporal_granularity"),
# TODO: change eventually to enum
# c_original_inventory_sector.cast(pl.Categorical).alias(ORIGINAL_INVENTORY_SECTOR)
).with_columns(*[C(col_name).cast(pl.Float64, strict=False).alias(col_name) for col_name in floats])
.with_columns(*[C(col_name).cast(pl.UInt64).alias(col_name) for col_name in uint64s])
# This is for debugging the memory consumption, remove eventually
.limit(1_000_000_000)
)
def _load_source_confidence(fp) -> pl.DataFrame:
dates = [
START_TIME, END_TIME, CREATED_DATE, MODIFIED_DATE
]
cf_cols = [SOURCE_TYPE, CAPACITY, CAPACITY_FACTOR, ACTIVITY, CO2_EMISSIONS_FACTOR,
CH4_EMISSIONS_FACTOR, N2O_EMISSIONS_FACTOR, CO2_EMISSIONS, CH4_EMISSIONS,
N2O_EMISSIONS, TOTAL_CO2E_20YRGWP, TOTAL_CO2E_100YRGWP]
uint64s = [SOURCE_ID]
# TODO: make it lazy
df = pl.read_csv(fp.read(), infer_schema_length=0) #.lazy()
sels = ([C(col_name).str.to_datetime(strict=(col_name in {START_TIME, END_TIME})).alias(col_name) for col_name in dates]
+ [c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY), c_source_id.cast(pl.UInt64).alias(SOURCE_ID)]
+ [C(col_name).cast(pl.Categorical, strict=False).alias("conf_"+col_name) for col_name in cf_cols]
)
df = df.select(*sels)
# For debugging
return df #.limit(1_000)
def _load_source_conf(s_fp, c_fp) -> pl.DataFrame:
s_df = _load_sources(s_fp)
c_df = _load_source_confidence(c_fp)
return (s_df.join(c_df.drop(["created_date", "modified_date"]),
on=["start_time", "end_time", "iso3_country", "source_id"],
how="left")
)
def read_all_sources(p: Path) -> pl.DataFrame:
res_df = None
# dfs = []
# TODO: replace by a proper temp directory
tmp_dir = Path(tempfile.gettempdir())
data_files = []
for fname in _files:
_logger.debug(f"open {fname}")
zf = ZipFile(Path(ct_data_source) / fname)
source_names = [n for n in zf.namelist() if n.endswith("-sources.csv")]
_logger.debug(f"sources: {source_names}")
for sname in source_names:
_logger.debug(f"opening {fname} / {sname}")
c_name = sname.replace("_emissions-sources.csv", "_emissions-sources_confidence.csv")
_logger.debug(f"opening {fname} / {c_name}")
df = _load_source_conf(zf.open(sname), zf.open(c_name))
df = (df.with_columns(
pl.lit(fname.replace(".zip", "")).cast(pl.Categorical).alias("ct_package"),
pl.lit(sname.replace("_emissions-sources.csv", "")).cast(pl.Categorical).alias("ct_file")))
tmp_name = tmp_dir / sname.replace(".csv",".parquet")
df.write_parquet(tmp_name)
data_files.append(tmp_name)
_logger.debug(f"wrote {tmp_name}")
dfs = []
for tmp_name in data_files:
_logger.debug(f"scan {tmp_name}")
df = pl.scan_parquet(tmp_name)
df = df.pipe(_recast_parquet)
dfs.append(df)
res_df = pl.concat(dfs)
return res_df
def _load_country_emissions(fp) -> pl.DataFrame:
dates = [
START_TIME, END_TIME, CREATED_DATE, MODIFIED_DATE
]
floats = [EMISSIONS_QUANTITY]
enums = {
ISO3_COUNTRY:ct.iso3_enum,
GAS:ct.gas_enum,
TEMPORAL_GRANULARITY:ct.temporal_granularity_enum,
# TODO: activate once the whole list is gathered
# ORIGINAL_INVENTORY_SECTOR:ct.inventory_sector_enum
}
df = pl.read_csv(fp.read(), infer_schema_length=0)
all_columns = ['iso3_country',
'start_time',
'end_time',
'original_inventory_sector',
'gas',
'emissions_quantity',
'emissions_quantity_units',
'temporal_granularity',
'created_date',
'modified_date']
# Check that the columns match exactly
s1 = set(df.columns)
s2 = set(all_columns)
assert s1 == s2, (s1-s2, s2-s1, list(zip(sorted(df.columns), sorted(all_columns))))
df = df.select(*all_columns)
return (df.with_columns(
# Only start_time and end_time are required
*[C(col_name).str.to_datetime(format="%Y-%m-%d %H:%M:%S%.f",
strict=(col_name in {START_TIME, END_TIME})).alias(col_name) for col_name in dates])
.with_columns(
c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY),
c_gas.cast(ct.gas_enum, strict=False).alias(GAS),
# TODO: causing issues
# c_original_inventory_sector.cast(pl.Categorical).alias(ORIGINAL_INVENTORY_SECTOR),
c_temporal_granularity.cast(ct.temporal_granularity_enum).alias(TEMPORAL_GRANULARITY)
).with_columns(*[C(col_name).cast(pl.Float64, strict=False).alias(col_name) for col_name in floats])
)
def read_country_emissions(p: Path) -> pl.DataFrame:
# TODO: add meta info about the provenance: ct_package and ct_file
# So fast there is no need to store a materialized version.
dfs = []
for fname in _files:
_logger.debug(f"open {fname}")
zf = ZipFile(Path(ct_data_source) / fname)
source_names = [n for n in zf.namelist() if n.endswith("_country_emissions.csv")]
_logger.debug(f"sources: {source_names}")
for sname in source_names:
_logger.debug(f"opening {fname} / {sname}")
df = _load_country_emissions(zf.open(sname))
df = (df.with_columns(
pl.lit(fname.replace(".zip", "")).cast(pl.Categorical).alias(CT_PACKAGE),
pl.lit(sname.replace("_country_emissions.csv", "")).cast(pl.Categorical).alias(CT_FILE)))
dfs.append(df)
res_df = pl.concat(dfs)
return res_df
ct_data_source = "/home/tjhunter/Downloads/"
cedf = read_country_emissions(Path(ct_data_source))
if False:
df = read_all_sources(Path(ct_data_source))
# Using snappy because it is more broadly compatible with parquet readers
# Polars 0.20 does not support statistics
for year in range(2015, 2023):
df.filter(c_start_time.dt.year()==year).sink_parquet(
f"/home/tjhunter/Downloads/ct_data_{year}.parquet",
compression="snappy",
statistics=False
)
DEBUG:__main__:open agriculture.zip
DEBUG:__main__:sources: ['enteric-fermentation-other_country_emissions.csv', 'other-agricultural-soil-emissions_country_emissions.csv', 'manure-management-other_country_emissions.csv', 'rice-cultivation_country_emissions.csv', 'enteric-fermentation-cattle-feedlot_country_emissions.csv', 'manure-management-cattle-feedlot_country_emissions.csv', 'synthetic-fertilizer-application_country_emissions.csv', 'cropland-fires_country_emissions.csv', 'enteric-fermentation-cattle-pasture_country_emissions.csv', 'manure-left-on-pasture-cattle_country_emissions.csv']
DEBUG:__main__:opening agriculture.zip / enteric-fermentation-other_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / other-agricultural-soil-emissions_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / manure-management-other_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / rice-cultivation_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / enteric-fermentation-cattle-feedlot_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / manure-management-cattle-feedlot_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / synthetic-fertilizer-application_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / cropland-fires_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / enteric-fermentation-cattle-pasture_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / manure-left-on-pasture-cattle_country_emissions.csv
DEBUG:__main__:open buildings.zip
DEBUG:__main__:sources: ['other-onsite-fuel-usage_country_emissions.csv', 'residential-and-commercial-onsite-fuel-usage_country_emissions.csv']
DEBUG:__main__:opening buildings.zip / other-onsite-fuel-usage_country_emissions.csv
DEBUG:__main__:opening buildings.zip / residential-and-commercial-onsite-fuel-usage_country_emissions.csv
DEBUG:__main__:open fluorinated_gases.zip
DEBUG:__main__:sources: ['fluorinated-gases_country_emissions.csv']
DEBUG:__main__:opening fluorinated_gases.zip / fluorinated-gases_country_emissions.csv
DEBUG:__main__:open forestry_and_land_use.zip
DEBUG:__main__:sources: ['water-reservoirs_country_emissions.csv', 'forest-land-degradation_country_emissions.csv', 'shrubgrass-fires_country_emissions.csv', 'wetland-fires_country_emissions.csv', 'net-shrubgrass_country_emissions.csv', 'forest-land-fires_country_emissions.csv', 'removals_country_emissions.csv', 'net-wetland_country_emissions.csv', 'net-forest-land_country_emissions.csv', 'forest-land-clearing_country_emissions.csv']
DEBUG:__main__:opening forestry_and_land_use.zip / water-reservoirs_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / forest-land-degradation_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / shrubgrass-fires_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / wetland-fires_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / net-shrubgrass_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / forest-land-fires_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / removals_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / net-wetland_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / net-forest-land_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / forest-land-clearing_country_emissions.csv
DEBUG:__main__:open fossil_fuel_operations.zip
DEBUG:__main__:sources: ['other-fossil-fuel-operations_country_emissions.csv', 'solid-fuel-transformation_country_emissions.csv', 'oil-and-gas-refining_country_emissions.csv', 'coal-mining_country_emissions.csv', 'oil-and-gas-production-and-transport_country_emissions.csv']
DEBUG:__main__:opening fossil_fuel_operations.zip / other-fossil-fuel-operations_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / solid-fuel-transformation_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / oil-and-gas-refining_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / coal-mining_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / oil-and-gas-production-and-transport_country_emissions.csv
DEBUG:__main__:open manufacturing.zip
DEBUG:__main__:sources: ['petrochemicals_country_emissions.csv', 'other-manufacturing_country_emissions.csv', 'pulp-and-paper_country_emissions.csv', 'chemicals_country_emissions.csv', 'aluminum_country_emissions.csv', 'steel_country_emissions.csv', 'cement_country_emissions.csv']
DEBUG:__main__:opening manufacturing.zip / petrochemicals_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / other-manufacturing_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / pulp-and-paper_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / chemicals_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / aluminum_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / steel_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / cement_country_emissions.csv
DEBUG:__main__:open mineral_extraction.zip
DEBUG:__main__:sources: ['rock-quarrying_country_emissions.csv', 'sand-quarrying_country_emissions.csv', 'copper-mining_country_emissions.csv', 'iron-mining_country_emissions.csv', 'bauxite-mining_country_emissions.csv']
DEBUG:__main__:opening mineral_extraction.zip / rock-quarrying_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / sand-quarrying_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / copper-mining_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / iron-mining_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / bauxite-mining_country_emissions.csv
DEBUG:__main__:open power.zip
DEBUG:__main__:sources: ['other-energy-use_country_emissions.csv', 'electricity-generation_country_emissions.csv']
DEBUG:__main__:opening power.zip / other-energy-use_country_emissions.csv
DEBUG:__main__:opening power.zip / electricity-generation_country_emissions.csv
DEBUG:__main__:open transportation.zip
DEBUG:__main__:sources: ['other-transport_country_emissions.csv', 'railways_country_emissions.csv', 'road-transportation_country_emissions.csv', 'international-aviation_country_emissions.csv', 'domestic-aviation_country_emissions.csv', 'international-shipping_country_emissions.csv', 'domestic-shipping_country_emissions.csv']
DEBUG:__main__:opening transportation.zip / other-transport_country_emissions.csv
DEBUG:__main__:opening transportation.zip / railways_country_emissions.csv
DEBUG:__main__:opening transportation.zip / road-transportation_country_emissions.csv
DEBUG:__main__:opening transportation.zip / international-aviation_country_emissions.csv
DEBUG:__main__:opening transportation.zip / domestic-aviation_country_emissions.csv
DEBUG:__main__:opening transportation.zip / international-shipping_country_emissions.csv
DEBUG:__main__:opening transportation.zip / domestic-shipping_country_emissions.csv
DEBUG:__main__:open waste.zip
DEBUG:__main__:sources: ['incineration-and-open-burning-of-waste_country_emissions.csv', 'biological-treatment-of-solid-waste-and-biogenic_country_emissions.csv', 'solid-waste-disposal_country_emissions.csv', 'wastewater-treatment-and-discharge_country_emissions.csv']
DEBUG:__main__:opening waste.zip / incineration-and-open-burning-of-waste_country_emissions.csv
DEBUG:__main__:opening waste.zip / biological-treatment-of-solid-waste-and-biogenic_country_emissions.csv
DEBUG:__main__:opening waste.zip / solid-waste-disposal_country_emissions.csv
DEBUG:__main__:opening waste.zip / wastewater-treatment-and-discharge_country_emissions.csv
cedf
shape: (512_110, 12)
| iso3_country | start_time | end_time | original_inventory_sector | gas | emissions_quantity | emissions_quantity_units | temporal_granularity | created_date | modified_date | ct_package | ct_file |
|---|---|---|---|---|---|---|---|---|---|---|---|
| enum | datetime[ns] | datetime[ns] | str | enum | f64 | str | enum | datetime[ns] | datetime[ns] | cat | cat |
| "AUT" | 2022-01-01 00:00:00 | 2022-12-31 00:00:00 | "enteric-fermen… | "n2o" | 0.0 | "tonnes" | null | 2023-10-23 11:43:08.395355 | null | "agriculture" | "enteric-fermen… |
| "AZE" | 2022-01-01 00:00:00 | 2022-12-31 00:00:00 | "enteric-fermen… | "co2" | 0.0 | "tonnes" | null | 2023-10-23 11:43:08.395355 | null | "agriculture" | "enteric-fermen… |
| "ABW" | 2022-01-01 00:00:00 | 2022-12-31 00:00:00 | "enteric-fermen… | "co2" | 0.0 | "tonnes" | null | 2023-10-23 11:43:08.395355 | null | "agriculture" | "enteric-fermen… |
| "ABW" | 2022-01-01 00:00:00 | 2022-12-31 00:00:00 | "enteric-fermen… | "n2o" | 0.0 | "tonnes" | null | 2023-10-23 11:43:08.395355 | null | "agriculture" | "enteric-fermen… |
| "ABW" | 2022-01-01 00:00:00 | 2022-12-31 00:00:00 | "enteric-fermen… | "ch4" | 0.0 | "tonnes" | null | 2023-10-23 11:43:08.395355 | null | "agriculture" | "enteric-fermen… |
| … | … | … | … | … | … | … | … | … | … | … | … |
| "TUV" | 2016-01-01 00:00:00 | 2016-12-31 00:00:00 | "wastewater-tre… | "co2e_100yr" | 2516.497358 | "tonnes" | "annual" | 2023-10-06 14:56:33.428328 | 2023-11-03 15:13:27.777139 | "waste" | "wastewater-tre… |
| "UGA" | 2016-01-01 00:00:00 | 2016-12-31 00:00:00 | "wastewater-tre… | "co2e_20yr" | 1.6999e7 | "tonnes" | "annual" | 2023-10-06 14:56:33.428328 | 2023-11-03 15:13:28.159334 | "waste" | "wastewater-tre… |
| "UGA" | 2015-01-01 00:00:00 | 2015-12-31 00:00:00 | "wastewater-tre… | "co2e_20yr" | 1.6456e7 | "tonnes" | "annual" | 2023-10-06 14:56:33.428328 | 2023-11-03 15:13:34.328448 | "waste" | "wastewater-tre… |
| "ZWE" | 2015-01-01 00:00:00 | 2015-12-31 00:00:00 | "wastewater-tre… | "co2e_20yr" | 6.1586e6 | "tonnes" | "annual" | 2023-10-06 14:56:33.428328 | 2023-11-03 15:13:34.624499 | "waste" | "wastewater-tre… |
| "ZWE" | 2015-01-01 00:00:00 | 2015-12-31 00:00:00 | "wastewater-tre… | "co2e_100yr" | 2.1487e6 | "tonnes" | "annual" | 2023-10-06 14:56:33.428328 | 2023-11-03 15:13:34.624499 | "waste" | "wastewater-tre… |
Country checks#
gas = "co2e_100yr"
year = 2022
cedf_gy = cedf.filter(c_gas == gas).filter(c_start_time.dt.year()==year)
((cedf_gy
.filter(c_emissions_quantity > 0) # I think this is the most relevant, but to check
)[EMISSIONS_QUANTITY].sum(),
(cedf_gy
)[EMISSIONS_QUANTITY].sum())
(76430900242.65804, 55171507719.83738)
px.bar(cedf_gy
.filter(c_emissions_quantity > 0)
.group_by(c_iso3_country)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.head(20)
,x=ISO3_COUNTRY, y=EMISSIONS_QUANTITY,log_y=True)
# Mozambique emits more than Germany??
px.bar(cedf_gy
.group_by(c_iso3_country, c_original_inventory_sector)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.filter(c_iso3_country.is_in([
"CHN", "USA", "IND", "RUS",
# "MOZ",
"FRA", "NLD"]))
,x=ISO3_COUNTRY, y=EMISSIONS_QUANTITY,color=ORIGINAL_INVENTORY_SECTOR,log_y=False)
# Chinese forests remove more than all of france emissions
px.bar(cedf_gy
.group_by(c_original_inventory_sector)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.head(10)
,x=ORIGINAL_INVENTORY_SECTOR, y=EMISSIONS_QUANTITY,log_y=False)
px.bar(cedf_gy
.group_by(c_original_inventory_sector)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.tail(5)
,x=ORIGINAL_INVENTORY_SECTOR, y=EMISSIONS_QUANTITY,log_y=False)
Source analysis#
# Parquet loses enum structures.
sdf = (pl.scan_parquet(f"/home/tjhunter/Downloads/ct_data_{year}.parquet").pipe(_recast_parquet)
)
sdf_gy = sdf.filter(c_gas == gas) #.collect()
sdf.select((c_start_time.dt.year())).min().collect() #.group_by("conf_total_co2e_100yrgwp").count().collect()
shape: (1, 1)
| start_time |
|---|
| i32 |
| 2022 |
Number of distinct sources for last year
sdf_gy.select(c_source_id.unique_counts()).count().collect().item()
627582
(sdf.filter(c_original_inventory_sector.is_null())
.group_by(c_start_time.dt.year(), C("ct_package"), C("ct_file"))
.agg(pl.count())
.collect())
/tmp/ipykernel_1880515/3687814498.py:3: DeprecationWarning:
`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (9, 4)
| start_time | ct_package | ct_file | count |
|---|---|---|---|
| i32 | cat | cat | u32 |
| 2022 | "forestry_and_l… | "wetland-fires" | 32730 |
| 2022 | "forestry_and_l… | "forest-land-cl… | 211315 |
| 2022 | "forestry_and_l… | "shrubgrass-fir… | 93935 |
| 2022 | "forestry_and_l… | "net-wetland" | 213760 |
| 2022 | "forestry_and_l… | "net-shrubgrass… | 248045 |
| 2022 | "forestry_and_l… | "forest-land-fi… | 81120 |
| 2022 | "forestry_and_l… | "net-forest-lan… | 248300 |
| 2022 | "forestry_and_l… | "forest-land-de… | 94420 |
| 2022 | "forestry_and_l… | "removals" | 250620 |
px.bar(sdf
.select(c_original_inventory_sector.value_counts(sort=True))
.collect()
.unnest(ORIGINAL_INVENTORY_SECTOR),
x=ORIGINAL_INVENTORY_SECTOR, y="count", log_y=True)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
134 try:
--> 135 module = importlib.import_module(name)
136 except ImportError:
File /usr/lib/python3.10/importlib/__init__.py:126, in import_module(name, package)
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1050, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1027, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1004, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'pyarrow'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_core.py:1436, in build_dataframe(args, constructor)
1435 columns = list(necessary_columns)
-> 1436 args["data_frame"] = pd.api.interchange.from_dataframe(
1437 args["data_frame"].select_columns_by_name(columns)
1438 )
1439 except (ImportError, NotImplementedError) as exc:
1440 # temporary workaround; developers of third-party libraries themselves
1441 # should try a different implementation, if available. For example:
1442 # def __dataframe__(self, ...):
1443 # if not some_condition:
1444 # self.to_pandas(...)
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:71, in from_dataframe(df, allow_copy)
69 raise ValueError("`df` does not support __dataframe__")
---> 71 return _from_dataframe(
72 df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
73 )
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:94, in _from_dataframe(df, allow_copy)
93 for chunk in df.get_chunks():
---> 94 pandas_df = protocol_df_chunk_to_pandas(chunk)
95 pandas_dfs.append(pandas_df)
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:148, in protocol_df_chunk_to_pandas(df)
147 elif dtype == DtypeKind.STRING:
--> 148 columns[name], buf = string_column_to_ndarray(col)
149 elif dtype == DtypeKind.DATETIME:
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:300, in string_column_to_ndarray(col)
299 valid_buff, valid_dtype = buffers["validity"]
--> 300 null_pos = buffer_to_ndarray(
301 valid_buff, valid_dtype, offset=col.offset, length=col.size()
302 )
303 if sentinel_val == 0:
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:445, in buffer_to_ndarray(buffer, dtype, length, offset)
444 assert length is not None, "`length` must be specified for a bit-mask buffer."
--> 445 pa = import_optional_dependency("pyarrow")
446 arr = pa.BooleanArray.from_buffers(
447 pa.bool_(),
448 length,
449 [None, pa.foreign_buffer(buffer.ptr, length)],
450 offset=offset,
451 )
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/compat/_optional.py:138, in import_optional_dependency(name, extra, errors, min_version)
137 if errors == "raise":
--> 138 raise ImportError(msg)
139 return None
ImportError: Missing optional dependency 'pyarrow'. Use pip or conda to install pyarrow.
During handling of the above exception, another exception occurred:
ModuleNotFoundError Traceback (most recent call last)
/tmp/ipykernel_1880515/786101580.py in ?()
----> 1 px.bar(sdf
2 .select(c_original_inventory_sector.value_counts(sort=True))
3 .collect()
4 .unnest(ORIGINAL_INVENTORY_SECTOR),
~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_chart_types.py in ?(data_frame, x, y, color, pattern_shape, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, hover_name, hover_data, custom_data, text, base, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, color_discrete_sequence, color_discrete_map, color_continuous_scale, pattern_shape_sequence, pattern_shape_map, range_color, color_continuous_midpoint, opacity, orientation, barmode, log_x, log_y, range_x, range_y, text_auto, title, template, width, height)
369 """
370 In a bar plot, each row of `data_frame` is represented as a rectangular
371 mark.
372 """
--> 373 return make_figure(
374 args=locals(),
375 constructor=go.Bar,
376 trace_patch=dict(textposition="auto"),
~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_core.py in ?(args, constructor, trace_patch, layout_patch)
2086 trace_patch = trace_patch or {}
2087 layout_patch = layout_patch or {}
2088 apply_default_cascade(args)
2089
-> 2090 args = build_dataframe(args, constructor)
2091 if constructor in [go.Treemap, go.Sunburst, go.Icicle] and args["path"] is not None:
2092 args = process_dataframe_hierarchy(args)
2093 if constructor in [go.Pie]:
~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_core.py in ?(args, constructor)
1446 args["data_frame"] = df_not_pandas.toPandas()
1447 elif hasattr(df_not_pandas, "to_pandas_df"):
1448 args["data_frame"] = df_not_pandas.to_pandas_df()
1449 elif hasattr(df_not_pandas, "to_pandas"):
-> 1450 args["data_frame"] = df_not_pandas.to_pandas()
1451 else:
1452 raise exc
1453
~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/polars/dataframe/frame.py in ?(self, use_pyarrow_extension_array, **kwargs)
2257 return self._to_pandas_with_object_columns(
2258 use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
2259 )
2260
-> 2261 return self._to_pandas_without_object_columns(
2262 self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
2263 )
~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/polars/dataframe/frame.py in ?(self, df, use_pyarrow_extension_array, **kwargs)
2308 ) -> pd.DataFrame:
2309 if not df.width: # Empty dataframe, cannot infer schema from batches
2310 return pd.DataFrame()
2311
-> 2312 record_batches = df._df.to_pandas()
2313 tbl = pa.Table.from_batches(record_batches)
2314 if use_pyarrow_extension_array:
2315 return tbl.to_pandas(
ModuleNotFoundError: No module named 'pyarrow'
px.bar(sdf.select(C("ct_file").value_counts(sort=True)).collect()
.unnest("ct_file"),
x="ct_file", y="count",log_y=True)
(sdf_gy
.group_by(c_original_inventory_sector)
.agg(c_source_id.count().alias("c"), c_emissions_quantity.sum())
.collect()
.sort("c", descending=True))
shape: (31, 3)
| original_inventory_sector | c | emissions_quantity |
|---|---|---|
| str | u32 | f64 |
| null | 294849 | -3.8651e9 |
| "domestic-shipp… | 213983 | 1.9471e8 |
| "international-… | 139957 | 5.1798e8 |
| "wastewater-tre… | 53466 | 1.3067e8 |
| "cropland-fires… | 47940 | 3.5394e9 |
| "manure-left-on… | 46245 | 1.1210e9 |
| "enteric-fermen… | 46245 | 3.2797e9 |
| "synthetic-fert… | 43209 | 1.3111e9 |
| "cement" | 27108 | 1.8021e9 |
| "road-transport… | 16922 | 3.1400e9 |
| "steel" | 10920 | 1.7520e9 |
| "solid-waste-di… | 10314 | 1.0297e9 |
| … | … | … |
| "enteric-fermen… | 4639 | 5.0844e7 |
| "coal-mining" | 3164 | 1.4659e9 |
| "aluminum" | 2760 | 3.49556024e8 |
| "other-manufact… | 2115 | 2.7386e8 |
| "oil-and-gas-pr… | 1647 | 8.4849e8 |
| "oil-and-gas-re… | 684 | 9.7051e8 |
| "oil-and-gas-pr… | 660 | 3.4766e9 |
| "copper-mining" | 555 | 8.3548502e7 |
| "iron-mining" | 536 | 7.1405542e7 |
| "pulp-and-paper… | 387 | 8.3640643e7 |
| "bauxite-mining… | 175 | 8.114666e6 |
| "petrochemicals… | 116 | 1.4665e8 |
(sdf_gy
.group_by(c_iso3_country, c_original_inventory_sector)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.filter(c_iso3_country.is_in(["CHN", "USA", "FRA"]))
.collect()
)
shape: (83, 3)
| iso3_country | original_inventory_sector | emissions_quantity |
|---|---|---|
| enum | str | f64 |
| "CHN" | "electricity-ge… | 4.5421e9 |
| "USA" | "electricity-ge… | 1.4893e9 |
| "CHN" | "coal-mining" | 1.0123e9 |
| "CHN" | "steel" | 9.22971074e8 |
| "USA" | "oil-and-gas-pr… | 8.5006e8 |
| "USA" | "oil-and-gas-pr… | 8.4849e8 |
| "USA" | "road-transport… | 8.2750e8 |
| "CHN" | "cement" | 7.046727e8 |
| "CHN" | "road-transport… | 4.0347e8 |
| "CHN" | "rice-cultivati… | 3.9366e8 |
| "CHN" | "synthetic-fert… | 2.9521e8 |
| "CHN" | "chemicals" | 2.6414734e8 |
| … | … | … |
| "CHN" | "enteric-fermen… | 2.6370e6 |
| "FRA" | "aluminum" | 2.626109e6 |
| "FRA" | "solid-waste-di… | 2.569196e6 |
| "FRA" | "domestic-shipp… | 2.1999e6 |
| "FRA" | "domestic-aviat… | 1.8067e6 |
| "FRA" | "pulp-and-paper… | 938314.0 |
| "CHN" | "bauxite-mining… | 830360.0 |
| "FRA" | "water-reservoi… | 106100.867704 |
| "FRA" | "oil-and-gas-pr… | 86660.769884 |
| "FRA" | "rice-cultivati… | 0.0 |
| "FRA" | null | -4.5304e7 |
| "CHN" | null | -6.4086e8 |
px.bar(
(sdf_gy
.filter(c_emissions_quantity > 0 )
.group_by(c_iso3_country, c_original_inventory_sector)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.filter(c_iso3_country.is_in(["CHN", "USA", "FRA", "NLD", "IND"]))
.collect()
)
,x=ISO3_COUNTRY, y=EMISSIONS_QUANTITY,color=ORIGINAL_INVENTORY_SECTOR,log_y=False)
px.bar(
(sdf_gy
.group_by(c_original_inventory_sector)
.agg(c_emissions_quantity.sum())
.sort([c_emissions_quantity], descending=True)
.collect()
),
x=ORIGINAL_INVENTORY_SECTOR,y="emissions_quantity",log_y=True)
Joint analysis#
cedf_gy_agg = (cedf_gy
.group_by(c_iso3_country, c_ct_package, c_ct_file)
.agg(c_emissions_quantity.sum())
)
sdf_gy_agg = (sdf_gy
.group_by(c_iso3_country, c_ct_package, c_ct_file)
.agg(c_emissions_quantity.sum())
.collect()
)
Total emissions from country inventory
(cedf_gy_agg
.with_columns((c_emissions_quantity > 0).alias("is_emission"))
.group_by("is_emission")
.agg(c_emissions_quantity.sum())
.sort("is_emission")
)
shape: (2, 2)
| is_emission | emissions_quantity |
|---|---|
| bool | f64 |
| false | -2.1259e10 |
| true | 7.6431e10 |
Total emissions from source inventory
(sdf_gy_agg
.with_columns((c_emissions_quantity > 0).alias("is_emission"))
.group_by("is_emission")
.agg(c_emissions_quantity.sum())
.sort("is_emission")
)
shape: (2, 2)
| is_emission | emissions_quantity |
|---|---|
| bool | f64 |
| false | -5.7529e10 |
| true | 9.5150e10 |
TODO: there is a significant discrepancy between the total reported for each report.
EMISSIONS_QUANTITY_S = EMISSIONS_QUANTITY + "_s"
EMISSIONS_QUANTITY_CE = EMISSIONS_QUANTITY + "_ce"
jdf = (cedf_gy_agg
.join(sdf_gy_agg, on=[ISO3_COUNTRY, CT_PACKAGE, CT_FILE], suffix="_s", how="outer")
.with_columns(
c_iso3_country.fill_null(C(ISO3_COUNTRY+"_s")),
c_ct_package.fill_null(C(CT_PACKAGE+"_s")),
c_ct_file.fill_null(C(CT_FILE+"_s")),
c_emissions_quantity.alias(EMISSIONS_QUANTITY_CE),
).select(
c_iso3_country,
c_ct_package,
c_ct_file,
EMISSIONS_QUANTITY_S,
EMISSIONS_QUANTITY_CE
))
jdf
shape: (12_916, 5)
| iso3_country | ct_package | ct_file | emissions_quantity_s | emissions_quantity_ce |
|---|---|---|---|---|
| enum | cat | cat | f64 | f64 |
| "ABW" | "agriculture" | "enteric-fermen… | null | 0.0 |
| "CUW" | "agriculture" | "enteric-fermen… | null | 0.0 |
| "SWZ" | "agriculture" | "enteric-fermen… | null | 47815.6 |
| "VAT" | "agriculture" | "enteric-fermen… | null | 0.0 |
| "VGB" | "agriculture" | "enteric-fermen… | null | 0.0 |
| "IOT" | "agriculture" | "enteric-fermen… | null | 0.0 |
| "MAR" | "agriculture" | "enteric-fermen… | null | 4.4698e6 |
| "MNP" | "agriculture" | "enteric-fermen… | null | 0.0 |
| "ALB" | "agriculture" | "enteric-fermen… | null | 517546.4 |
| "BEL" | "agriculture" | "enteric-fermen… | null | 308582.4 |
| "COD" | "agriculture" | "enteric-fermen… | null | 731715.6 |
| "CYP" | "agriculture" | "enteric-fermen… | null | 88902.8 |
| … | … | … | … | … |
| "GIN" | "agriculture" | "synthetic-fert… | 4917.187885 | null |
| "HKG" | "agriculture" | "enteric-fermen… | 0.0 | null |
| "LIE" | "agriculture" | "enteric-fermen… | 0.0 | null |
| "SUR" | "agriculture" | "synthetic-fert… | 494.277044 | null |
| "BRN" | "agriculture" | "synthetic-fert… | 8130.169221 | null |
| "BFA" | "agriculture" | "synthetic-fert… | 3200.893827 | null |
| "BWA" | "agriculture" | "synthetic-fert… | 3075.866695 | null |
| "ZNC" | "forestry_and_l… | "net-shrubgrass… | -82226.761935 | null |
| "ZNC" | "forestry_and_l… | "forest-land-cl… | 2417.281475 | null |
| "TLS" | "agriculture" | "synthetic-fert… | 8746.57223 | null |
| "TKM" | "agriculture" | "synthetic-fert… | 46194.154235 | null |
| "MOZ" | "agriculture" | "synthetic-fert… | 6825.984899 | null |
Sanity check to ensure that no emission was lost along the way
jdf.filter((c_iso3_country == "GRC") & (c_ct_file == "electricity-generation"))
shape: (1, 5)
| iso3_country | ct_package | ct_file | emissions_quantity_s | emissions_quantity_ce |
|---|---|---|---|---|
| enum | cat | cat | f64 | f64 |
| "GRC" | "power" | "electricity-ge… | 1.7217e7 | 1.8314e7 |
# cedf_gy.filter((c_iso3_country == "GRC") & (c_ct_file == "electricity-generation"))
list(cedf_gy.filter((c_iso3_country == "GRC") & (c_ct_package == "power")).tail(1))
[shape: (1,)
Series: 'iso3_country' [enum]
[
"GRC"
],
shape: (1,)
Series: 'start_time' [datetime[ns]]
[
2022-01-01 00:00:00
],
shape: (1,)
Series: 'end_time' [datetime[ns]]
[
2022-12-31 00:00:00
],
shape: (1,)
Series: 'original_inventory_sector' [str]
[
"electricity-ge…
],
shape: (1,)
Series: 'gas' [enum]
[
"co2e_100yr"
],
shape: (1,)
Series: 'emissions_quantity' [f64]
[
1.8314e7
],
shape: (1,)
Series: 'emissions_quantity_units' [str]
[
"tonnes"
],
shape: (1,)
Series: 'temporal_granularity' [enum]
[
"annual"
],
shape: (1,)
Series: 'created_date' [datetime[ns]]
[
2023-10-03 21:10:38.966803
],
shape: (1,)
Series: 'modified_date' [datetime[ns]]
[
2023-10-31 20:30:14.210153
],
shape: (1,)
Series: 'ct_package' [cat]
[
"power"
],
shape: (1,)
Series: 'ct_file' [cat]
[
"electricity-generation"
]]
(jdf
.with_columns(((C(EMISSIONS_QUANTITY_S) > 0) | (C(EMISSIONS_QUANTITY_CE) > 0)).alias("is_emission"))
.group_by("is_emission")
.agg(C(EMISSIONS_QUANTITY_S).sum(), C(EMISSIONS_QUANTITY_CE).sum()))
shape: (3, 3)
| is_emission | emissions_quantity_s | emissions_quantity_ce |
|---|---|---|
| bool | f64 | f64 |
| true | 9.4461e10 | 7.6428e10 |
| false | -5.6840e10 | -2.1257e10 |
| null | -461722.074512 | 0.0 |
jdf_cmp = (jdf
# .drop_nulls()
.group_by(c_ct_package, c_ct_file, c_iso3_country)
.agg(C(EMISSIONS_QUANTITY_S).sum(), C(EMISSIONS_QUANTITY_CE).sum())
.with_columns((C(EMISSIONS_QUANTITY_S)/C(EMISSIONS_QUANTITY_CE)).alias("found"))
)
jdf_cmp
shape: (12_916, 6)
| ct_package | ct_file | iso3_country | emissions_quantity_s | emissions_quantity_ce | found |
|---|---|---|---|---|---|
| cat | cat | enum | f64 | f64 | f64 |
| "agriculture" | "enteric-fermen… | "CYP" | 0.0 | 88902.8 | 0.0 |
| "agriculture" | "enteric-fermen… | "ERI" | 0.0 | 1.090222e6 | 0.0 |
| "agriculture" | "enteric-fermen… | "KOR" | 0.0 | 378792.4 | 0.0 |
| "agriculture" | "enteric-fermen… | "LTU" | 0.0 | 63394.8 | 0.0 |
| "agriculture" | "manure-managem… | "ARM" | 0.0 | 33541.4 | 0.0 |
| "agriculture" | "manure-managem… | "MWI" | 0.0 | 1194218.3 | 0.0 |
| "agriculture" | "manure-managem… | "PER" | 0.0 | 831675.7 | 0.0 |
| "agriculture" | "manure-managem… | "PRI" | 0.0 | 21196.7 | 0.0 |
| "agriculture" | "rice-cultivati… | "JAM" | 0.0 | 0.0 | NaN |
| "agriculture" | "rice-cultivati… | "POL" | 0.0 | 0.0 | NaN |
| "agriculture" | "rice-cultivati… | "FLK" | 0.0 | 0.0 | NaN |
| "agriculture" | "rice-cultivati… | "HMD" | 0.0 | 0.0 | NaN |
| … | … | … | … | … | … |
| "forestry_and_l… | "net-wetland" | "ZNC" | -1711.004657 | 0.0 | -inf |
| "agriculture" | "synthetic-fert… | "GUY" | 1494.081984 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "BLZ" | 3115.31478 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "AND" | 17.053587 | 0.0 | inf |
| "agriculture" | "enteric-fermen… | "AND" | 0.0 | 0.0 | NaN |
| "agriculture" | "synthetic-fert… | "NAM" | 807.056201 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "XKX" | 2155.818296 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "MCO" | 0.005225 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "PNG" | 24.142333 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "BOL" | 184266.189547 | 0.0 | inf |
| "agriculture" | "synthetic-fert… | "OMN" | 0.0 | 0.0 | NaN |
| "agriculture" | "enteric-fermen… | "HKG" | 0.0 | 0.0 | NaN |
px.scatter(jdf_cmp, x=EMISSIONS_QUANTITY_CE, y=EMISSIONS_QUANTITY_S,
color=CT_PACKAGE, # Does not seem to work
hover_name=ISO3_COUNTRY,
hover_data=[CT_PACKAGE, CT_FILE],
log_x=True, log_y=True)
(jdf_cmp
# .group_by(c_original_inventory_sector)
.select(
C(EMISSIONS_QUANTITY_S).sum(), C(EMISSIONS_QUANTITY_CE).sum()
))
shape: (1, 2)
| emissions_quantity_s | emissions_quantity_ce |
|---|---|
| f64 | f64 |
| 3.7621e10 | 5.5172e10 |
px.histogram(jdf_cmp, x="found",
color=CT_PACKAGE,
range_x=[0,3],
nbins=5000)
# px.bar(
# sdf_gy.group_by(["conf_co2_emissions_factor", c_original_inventory_sector]).count().collect()
# ,facet_col="conf_co2_emissions_factor",y="count",facet_row=ORIGINAL_INVENTORY_SECTOR
# )
Finding consistent estimates with pymc#
subs_df = (jdf_cmp
.filter(c_ct_file == "electricity-generation")
.filter((C("found") > 0.1) & (C("found") < 10))
.drop_nulls()
.sort(by=EMISSIONS_QUANTITY_S))
px.histogram(subs_df, x=EMISSIONS_QUANTITY_CE,
hover_name=ISO3_COUNTRY,
# log_x=True,
# range_x=[0,3],
nbins=500)
subs_df
shape: (163, 6)
| ct_package | ct_file | iso3_country | emissions_quantity_s | emissions_quantity_ce | found |
|---|---|---|---|---|---|
| cat | cat | enum | f64 | f64 | f64 |
| "power" | "electricity-ge… | "SLE" | 5000.0 | 16000.0 | 0.3125 |
| "power" | "electricity-ge… | "UGA" | 15000.0 | 81000.0 | 0.185185 |
| "power" | "electricity-ge… | "GNB" | 27000.0 | 67000.0 | 0.402985 |
| "power" | "electricity-ge… | "TCD" | 33000.0 | 249000.0 | 0.13253 |
| "power" | "electricity-ge… | "DJI" | 38000.0 | 49000.0 | 0.77551 |
| "power" | "electricity-ge… | "NER" | 65000.0 | 311000.0 | 0.209003 |
| "power" | "electricity-ge… | "BEN" | 76000.0 | 166000.0 | 0.457831 |
| "power" | "electricity-ge… | "NAM" | 81000.0 | 89000.0 | 0.910112 |
| "power" | "electricity-ge… | "COD" | 83000.0 | 321000.0 | 0.258567 |
| "power" | "electricity-ge… | "GIN" | 86000.0 | 640000.0 | 0.134375 |
| "power" | "electricity-ge… | "GMB" | 108000.0 | 255000.0 | 0.423529 |
| "power" | "electricity-ge… | "RWA" | 112000.0 | 287000.0 | 0.390244 |
| … | … | … | … | … | … |
| "power" | "electricity-ge… | "TWN" | 1.59274e8 | 1.6489e8 | 0.965941 |
| "power" | "electricity-ge… | "ZAF" | 1.99288e8 | 1.99571e8 | 0.998582 |
| "power" | "electricity-ge… | "IRN" | 2.07061e8 | 2.17963e8 | 0.949982 |
| "power" | "electricity-ge… | "IDN" | 2.32013e8 | 2.46401e8 | 0.941607 |
| "power" | "electricity-ge… | "DEU" | 2.35282e8 | 2.56218e8 | 0.918288 |
| "power" | "electricity-ge… | "SAU" | 2.53227e8 | 2.6305e8 | 0.962657 |
| "power" | "electricity-ge… | "KOR" | 2.55641e8 | 2.62354e8 | 0.974412 |
| "power" | "electricity-ge… | "JPN" | 4.14062e8 | 4.39829e8 | 0.941416 |
| "power" | "electricity-ge… | "RUS" | 5.33325e8 | 5.59267e8 | 0.953614 |
| "power" | "electricity-ge… | "IND" | 1.2134e9 | 1.2486e9 | 0.971774 |
| "power" | "electricity-ge… | "USA" | 1.4893e9 | 1.4955e9 | 0.995854 |
| "power" | "electricity-ge… | "CHN" | 4.5421e9 | 4.6779e9 | 0.970958 |
import arviz as az
import pymc as pm
import numpy as np
logging.getLogger("matplotlib").setLevel("WARNING")
logging.getLogger("filelock").setLevel("WARNING")
DEBUG:matplotlib:matplotlib data path: /home/tjhunter/work/carbonmap/science/climate_trace/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/home/tjhunter/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux
DEBUG:matplotlib:CACHEDIR=/home/tjhunter/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /home/tjhunter/.cache/matplotlib/fontlist-v330.json
WARNING:pytensor.tensor.blas:Using NumPy C-API based implementation for BLAS functions.
basic_model = pm.Model()
em_s = np.array([1.4893, 2.5, 5.0])
em_ce = np.array([1.4955, 1.0, 1.0])
def shiftedGamma(mean, size):
alpha = 8.5
beta = 2.0
# Putting the observed value on the mode of the distribution
mode = (alpha-1) / beta
# Mapping 10.m -> 0, m -> 1.0
return mean * (4/3. - (1.0/3) * (1.0 / mode) * pm.Gamma.dist(alpha=alpha, beta = beta, size=size))
with basic_model:
init_est = em_ce
epsi = pm.TruncatedNormal("epsi", mu = init_est, sigma=0.5 * init_est, lower=init_est * 0.5, upper=init_est * 10.0)
cp = pm.CustomDist("cp", epsi, dist=shiftedGamma, observed=em_ce)
ip = pm.CustomDist("ip", epsi, dist=shiftedGamma, observed=em_s)
# with basic_model:
# mean = 1.0
# alpha = 8.5
# beta = 2.5
# # Putting the observed value on the mode of the distribution
# mode = (alpha-1) / beta
# # Mapping 4.m -> 0, m -> 1.0
# res = pm.Deterministic("pc", mean * (4/3. - (1.0/3) * (1.0 / mode) * pm.Gamma("gam", alpha=alpha, beta = beta)))
with basic_model:
# draw 1000 posterior samples
idata = pm.sample()
INFO:pymc.sampling.mcmc:Auto-assigning NUTS sampler...
INFO:pymc.sampling.mcmc:Initializing NUTS using jitter+adapt_diag...
INFO:pymc.sampling.mcmc:Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc.sampling.mcmc:NUTS: [epsi]
100.00% [8000/8000 00:01<00:00 Sampling 4 chains, 26 divergences]
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1531.2619304730674.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2106.1051656863983.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 3722.466882441551.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1305.9654349923946.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1038.5129616417235.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 17536.169913009326.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 5647.055561227237.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1689.8615656169666.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1460.0419096957983.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1055.972918046751.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 5147.260470310525.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 10529.38165208882.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 4228.661923273439.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1271.6685896430693.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 3118.8017076741685.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1309.8491936496227.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 35154.4731015717.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 3604.2682822398997.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1842.0076232103413.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1817.1840971448275.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1102.8433734595544.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1027.8079879592774.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2330.0343520601264.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2760.5998211858036.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2679.2401135116706.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1478.1124079931085.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
INFO:pymc.sampling.mcmc:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1 seconds.
ERROR:pymc.stats.convergence:There were 26 divergences after tuning. Increase `target_accept` or reparameterize.
az.plot_trace(idata, combined=True);
az.summary(idata, round_to=2)
| mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
|---|---|---|---|---|---|---|---|---|---|
| epsi[0] | 1.53 | 0.14 | 1.28 | 1.79 | 0.0 | 0.0 | 2814.46 | 2120.88 | 1.0 |
| epsi[1] | 2.25 | 0.14 | 2.02 | 2.50 | 0.0 | 0.0 | 2449.41 | 1774.92 | 1.0 |
| epsi[2] | 4.13 | 0.13 | 3.91 | 4.38 | 0.0 | 0.0 | 3219.41 | 2628.10 | 1.0 |
Analyzing error bounds#
df= (sdf_gy
.group_by([c_iso3_country, c_ct_file, c_conf_total_co2e_100yrgwp, c_ct_package])
.agg(pl.count().alias("count"), c_emissions_quantity.sum())
.collect()
# .sort(by=[ORIGINAL_INVENTORY_SECTOR, "conf_total_co2e_100yrgwp"])
.sort(by=c_emissions_quantity)
)
df
shape: (5_585, 6)
| iso3_country | ct_file | conf_total_co2e_100yrgwp | ct_package | count | emissions_quantity |
|---|---|---|---|---|---|
| enum | cat | cat | cat | u32 | f64 |
| "BRA" | "removals" | "medium" | "forestry_and_l… | 5597 | -4.8601e9 |
| "COD" | "removals" | "medium" | "forestry_and_l… | 267 | -4.6924e9 |
| "AGO" | "removals" | "medium" | "forestry_and_l… | 177 | -3.4949e9 |
| "MOZ" | "removals" | "medium" | "forestry_and_l… | 141 | -3.0537e9 |
| "ZMB" | "removals" | "medium" | "forestry_and_l… | 126 | -2.7422e9 |
| "RUS" | "net-forest-lan… | "medium" | "forestry_and_l… | 2479 | -2.2479e9 |
| "AUS" | "removals" | "medium" | "forestry_and_l… | 551 | -2.2324e9 |
| "BRA" | "net-forest-lan… | "medium" | "forestry_and_l… | 5597 | -1.9387e9 |
| "RUS" | "removals" | "medium" | "forestry_and_l… | 2479 | -1.9316e9 |
| "USA" | "removals" | "medium" | "forestry_and_l… | 3195 | -1.7630e9 |
| "CAF" | "removals" | "medium" | "forestry_and_l… | 69 | -1.5976e9 |
| "BRA" | "net-shrubgrass… | "medium" | "forestry_and_l… | 5597 | -1.2465e9 |
| … | … | … | … | … | … |
| "BRA" | "forest-land-fi… | "low" | "forestry_and_l… | 2338 | 1.1465e9 |
| "USA" | "forest-land-cl… | "low" | "forestry_and_l… | 3004 | 1.1858e9 |
| "IND" | "electricity-ge… | "medium" | "power" | 471 | 1.2134e9 |
| "USA" | "electricity-ge… | "medium" | "power" | 2160 | 1.4784e9 |
| "CAF" | "forest-land-fi… | "low" | "forestry_and_l… | 69 | 2.0777e9 |
| "AUS" | "forest-land-fi… | "low" | "forestry_and_l… | 277 | 2.4736e9 |
| "ZMB" | "forest-land-fi… | "low" | "forestry_and_l… | 126 | 3.1439e9 |
| "BRA" | "forest-land-cl… | "low" | "forestry_and_l… | 5571 | 3.1661e9 |
| "AGO" | "forest-land-fi… | "low" | "forestry_and_l… | 170 | 3.3166e9 |
| "COD" | "forest-land-fi… | "low" | "forestry_and_l… | 219 | 3.6139e9 |
| "MOZ" | "forest-land-fi… | "low" | "forestry_and_l… | 141 | 3.7217e9 |
| "CHN" | "electricity-ge… | "medium" | "power" | 1225 | 4.5421e9 |
import scipy as sp
margins = {"very high": 0.01, "high": 0.05,"medium":0.25, "low": 0.5, "very low": 1.}
# margins = {"very high": 0.01, "high": 0.05,"medium":0.1, "low": 0.15, "very low": 0.2}
ERR_MARGIN = "err_margin"
rankings = (df.with_columns(
(c_conf_total_co2e_100yrgwp.replace(margins, return_dtype=pl.Float32, default=margins["very low"])
* c_emissions_quantity).alias(ERR_MARGIN)
).group_by(c_iso3_country, c_ct_file, c_ct_package)
.agg(c_emissions_quantity.sum(), C(ERR_MARGIN).sum())
.sort(by=ERR_MARGIN, descending=True)
.with_row_index()
)
_plot_data = (rankings
.filter(~(c_ct_package == "forestry_and_land_use")))
_country_order = (_plot_data
.group_by(c_iso3_country)
.agg(C(ERR_MARGIN).sum())
.sort(by=ERR_MARGIN, descending=True)[ISO3_COUNTRY].to_list())
px.bar(_plot_data.filter(c_iso3_country.is_in(_country_order[:10])), x=ISO3_COUNTRY,
y=ERR_MARGIN,
color=CT_PACKAGE,
hover_name=CT_FILE,
category_orders = {ISO3_COUNTRY: country_order},
log_y=False)
rankings.select(c_emissions_quantity.sum(), C(ERR_MARGIN).sum())
shape: (1, 2)
| emissions_quantity | err_margin |
|---|---|
| f64 | f64 |
| 3.7621e10 | 2.5173e10 |